!! Copyright (C) 2009,2010,2011,2012  Marco Restelli
!!
!! This file is part of:
!!   LDGH -- Local Hybridizable Discontinuous Galerkin toolkit
!!
!! LDGH is free software: you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation, either version 3 of the License, or
!! (at your option) any later version.
!!
!! LDGH is distributed in the hope that it will be useful, but WITHOUT
!! ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
!! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
!! License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with LDGH. If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Marco Restelli                   <marco.restelli@gmail.com>


!> \file
!! Main program for the CG method, Navier-Stokes problem.
!!
!! \n
!!
!! This program is the counterpart of \link cg_stokes.f90
!! <tt>cg_stokes</tt>\endlink for the complete Navier-Stokes problem.
!! The implementation follows the following criteria:
!! <ul>
!!  <li> reuse as much as possible the code of \link cg_stokes.f90
!!  <tt>cg_stokes</tt>\endlink for
!!  the linear part;
!!  <li> allow the same options of \link cg_stokes.f90
!!  <tt>cg_stokes</tt>\endlink concerning available finite element
!!  spaces, stabilizations, linear solvers and so on;
!!  <li> include two families of time stepping algorithms:
!!  <ul>
!!   <li> explicit advection
!!   <li> implicit advection;
!!  </ul>
!!  <li> include upwinding methods for the advection part.
!! </ul>
!!
!! The input file is composed by two <em>namelist</em>s. The first
!! one, named \c input, is the same input namelist used for the linear
!! problem (see \link cg_stokes.f90 <tt>cg_stokes</tt>\endlink) and is
!! described in \c mod_cg_ns_setup. The second namelist is called \c
!! nsinput and specifies options for the time depend problem as
!! follows:
!! <ul>
!!  <li> \c initial_cnd_from_file: set this to <tt>.true.</tt> to read
!!  the initial condition from a file (the initial condition
!!  must then provide an array <tt>uuu</tt>)
!!  <li> \c initial_cnd_file_name: name of the file with the initial
!!  condition (only used if <tt>initial_cnd_from_file</tt>)
!!  <li> \c implicit_advection: set this to <tt>.true.</tt> to use
!!  fully implicit time stepping (including advection, the system must
!!  be factored once at each time step) and to <tt>.false.</tt> to
!!  solve implicitly only the Stokes problem (the system is factored
!!  once before the time integration)
!!  <li> \c tt_sta, \c tt_end, \c dt: initial and final times and time
!!  step
!!  <li> \c dt_out: output time interval
!!  <li> \c dt_check: checkpointing time interval
!!  <li> \c upwinding: the following upwinding schemes are currently
!!  implemented (see \link
!!  mod_cg_ns_locmat::mod_cg_ns_locmat_constructor
!!  <tt>mod_cg_ns_locmat_constructor</tt>\endlink):
!!  <ul>
!!   <li> <tt>'none'</tt>: centered scheme
!!   <li> <tt>'N-scheme'</tt>: N-scheme upwinding (only for first
!!   order velocity)
!!   <li> <tt>'PSI'</tt>: PSI upwinding (only for first order
!!   velocity).
!!   <li> <tt>'side_sc'</tt>: side based, strongly consistent interior
!!   penalty stabilization (using the stabilization coefficient
!!   <tt>u_stab_coeff</tt>)
!!  </ul>
!!  <li> \c u_stab_coeff: the side based advection stabilization
!!  coefficient (used only if <tt>upwinding.eq.'side_sc'</tt>)
!! </ul>
!!
!! The name of the input file can be specified as command argument,
!! otherwise the default \c input_file_name_def is used.
!!
!! \todo Include the side based advection stabilization in the rhs for
!! the case of explicit advection.
!<
program cg_ns_main

 !$ use mod_omp_utils, only: &
 !$   mod_omp_utils_constructor, &
 !$   mod_omp_utils_destructor,  &
 !$   mod_omp_utils_initialized, &
 !$   detailed_timing_omp, &
 !$   omput_push_key,      &
 !$   omput_pop_key,       &
 !$   omput_start_timer,   &
 !$   omput_close_timer,   &
 !$   omput_write_time

 use mod_utils, only: &
   t_realtime, my_second

 use mod_messages, only: &
   mod_messages_constructor, &
   mod_messages_destructor,  &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   wp

 use mod_fu_manager, only: &
   new_file_unit

 use mod_octave_io, only: &
   real_format, &
   write_octave, read_octave_al

 use mod_sparse, only: &
   ! sparse types
   t_col,       &
   t_tri,       &
   t_pm_sk,     &
   ! overloaded operators
   transpose,   &
   clear

 use mod_octave_io_sparse, only: &
   write_octave

 use mod_mpi_utils, only: &
   mpi_comm_world, &
   mpi_logical, mpi_integer, wp_mpi, &
   mpi_comm_size, mpi_bcast

 use mod_linsolver, only: &
   c_linpb, gmres

 use mod_output_control, only: &
   elapsed_format, &
   base_name

 use mod_time_integrators, only: &
   mod_time_integrators_constructor, &
   mod_time_integrators_destructor,  &
   bdf1_init, bdf1_step, bdf1_clean, &
   bdf2_init, bdf2_step, bdf2_clean, &
   bdf3_init, bdf3_step, bdf3_clean

 use mod_grid, only: &
   t_grid,              &
   locate_point

 use mod_cgdofs, only: &
   t_cgdofs, new_cgdofs, &
   write_octave

 use mod_base, only: &
   t_base,       &
   write_octave

 use mod_testcases, only: &
   test_name,   &
   test_description,&
   coeff_u,     &
   coeff_gradu, &
   coeff_p,     &
   coeff_init

 use mod_cg_ns_setup, only: &
   mod_cg_ns_setup_constructor, &
   mod_cg_ns_setup_destructor,  &
   ! miscellaneous
   max_char_len,   &
   ! Grid
   grid, bcs, &
   ddc_grid,  &
   ! FE space
   pressure_stabilization, add_div_lagmultiplier, &
   ku, kp,                    &
   p_stab_type, p_stab_coeff, &
   uref_nodes, pref_nodes, &
   ubase,     pbase,     &
   udofs,     pdofs,     &
   ddc_udofs, ddc_pdofs, &
   ! Linear system
   mpi_nd, mpi_id, linsolver, &
   t_nodal_field, t_linpb_mumps, linpb, &
   ! error computation
   compute_error_norms, &
   err_extra_deg,  &
   ! IO
   write_sys,      &
   out_file_nml_name

 use mod_cg_ns_locmat, only: &
   t_bcs_error

 use mod_cg_ns_linsystem, only: &
   mod_cg_ns_linsystem_constructor, &
   mod_cg_ns_linsystem_destructor,  &
   t_ls,                            &
   ns_mat, ns_partmat, ns_packsol,  &
   ns_unpacksol, trim_sol, untrim_sol

 use mod_cg_ns_ode, only: &
   mod_cg_ns_ode_constructor, &
   mod_cg_ns_ode_destructor,  &
   implicit_advection, velocity_stabilization, u_stab_coeff, &
   antisym_adv, upwinding,                                   &
   mmmt, gn, dofs_nat, dofs_dir, gdofs_nat,                  &
   uuu1, uuu2, fff, fff1, fff2, rhs1, linsys,                &
   a22ia21, a22ib2t, a22if2, a22i, a12a22i, b2a22i,          &
   t_cg_ns_ode, new_cg_ns_ode, clear,                        &
   t_linsys, t_postproc

 use mod_error_norms, only: &
   mod_error_norms_constructor, &
   mod_error_norms_destructor,  &
   velocity_err, pressure_err

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------
 
 ! Parameters
 character(len=*), parameter :: input_file_name_def = 'cg-ns.in'
 character(len=*), parameter :: this_prog_name = 'cg-ns'
 character(len=max_char_len) :: input_file_name

 ! time stepping
 logical :: initial_cnd_from_file
 integer :: nstep, n_start, n
 real(wp) :: tt_sta, tt_end, dt, t_n, t_nm1
 character(len=max_char_len) :: initial_cnd_file_name
 type(t_cg_ns_ode) :: uuu0, uuun
 integer :: suold, swork! ! size of uold and work
 type(t_cg_ns_ode), allocatable :: uold(:), work(:)
 real(wp), allocatable :: uuu0_input(:)

 ! IO
 real(wp), parameter :: out_safety_factor = 1.0_wp - 1e-10_wp
 integer :: n_out, n_out2
 ! fast output at selected points
 integer :: np_fop, ne_fop ! number of FO points and elements
 integer, allocatable :: ie_fop(:)
 real(wp), allocatable :: xx_fop(:,:)
 ! auxiliary IO variables
 logical :: write_checkpoint, write_output, write_output2
 real(wp) :: time_last_out, dt_out, time_last_out2, dt_out2, &
   time_last_check, dt_check

 ! error computations
 real(wp) :: e_u_l2, e_u_h1, e_p_l2

 ! Auxiliary variables
 integer :: fu, ierr
 real(t_realtime) :: t00, t0, t1, t0step
 type(t_bcs_error) :: b_err
 character(len=max_char_len) :: message(6)

 ! Input namelist
 namelist /nsinput/ &
   ! initial condition from file
   initial_cnd_from_file, &
   initial_cnd_file_name, & ! used only if initial_cnd_from_file
   ! time stepping
   implicit_advection, &
   tt_sta, tt_end, dt, &
   ! output
   dt_out, &
   ! fast output
   dt_out2, np_fop, &
   ! chepoints
   dt_check, &
   ! Advection
   antisym_adv, &
   ! Upwinding
   upwinding, u_stab_coeff
 
 namelist /ns_fast_out/ &
   ! selected elements for the fast output
   xx_fop


 !-----------------------------------------------------------------------
 ! Initializations and stratup
 t00 = my_second()

 call mod_messages_constructor()

 !$ if(detailed_timing_omp) call mod_omp_utils_constructor()

 call mod_cg_ns_setup_constructor(input_file_name_def,input_file_name)

 ! Read the NS namelist
 call new_file_unit(fu,ierr)
 open(fu,file=trim(input_file_name), &
      status='old',action='read',form='formatted',iostat=ierr)
 read(fu,nsinput)
 close(fu,iostat=ierr)
 ! echo the input namelist
 open(fu,file=trim(out_file_nml_name),status='old',action='write', &
      form='formatted',position='append',iostat=ierr)
 write(fu,nsinput)
 close(fu,iostat=ierr)
 ! Read the ns_fast_out namelist
 if(np_fop.gt.0) then
   allocate(xx_fop(grid%d,np_fop))
   call new_file_unit(fu,ierr)
   open(fu,file=trim(input_file_name), &
        status='old',action='read',form='formatted',iostat=ierr)
   read(fu,ns_fast_out)
   close(fu,iostat=ierr)
   call locate_point(ie_fop,xx_fop,grid); deallocate(xx_fop)
   ne_fop = size(ie_fop)
 endif
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Linear system setup
 call mod_cg_ns_linsystem_constructor(ubase,pbase,p_stab_coeff, &
                        antisym_adv,trim(upwinding),u_stab_coeff)
 ! precompute the skeleton of the partitioned matrix
 velocity_stabilization = trim(upwinding).eq.'side_sc'
 call ns_partmat(mmmt,dofs_nat,dofs_dir,uuu0%uuu,uuu1%x,uuu2,      &
   fff,fff1,fff2,rhs1,linsys,uref_nodes,ubase,pbase,grid,udofs,    &
   pdofs,add_div_lagmultiplier,velocity_stabilization,             &
 implicit_advection,pressure_stabilization,trim(p_stab_type),b_err,&
   ddc_grid,ddc_udofs,ddc_pdofs,gn,gdofs_nat)
 if(b_err%lerr) call error(this_prog_name,'',b_err%message)

 ! initialize the linear solver
 select case(trim(linsolver))
  !case('iterative')
  ! allocate(t_linpb_it::linpb)
  ! select type(linpb); type is(t_linpb_it)
  !   pot_linpb%abstol    = itsol_abstol
  !   pot_linpb%tolerance = itsol_toll
  !   pot_linpb%nmax      = 50
  !   pot_linpb%rmax      = 20
  !   pot_linpb%mpi_id    = mpi_id
  !   pot_linpb%mpi_comm  = mpi_comm_world
  !   select case(trim(pot_itsolver))
  !    case('gmres')
  !     pot_linpb%solver => gmres
  !    case default
  !     call error(this_sub_name,this_mod_name,                  &
  !            'Unknown iterative solver "'//trim(pot_itsolver)//'".')
  !   end select
  ! end select
  case('mumps')
   allocate(t_linpb_mumps::linpb)
   select type(linpb); type is(t_linpb_mumps)
     linpb%distributed    = .true.
     linpb%poo            = 3 ! SCOTCH
     linpb%transposed_mat = .true.
     linpb%gn             = gn
     linpb%m              => mmmt%m(1,1)
     linpb%rhs            => rhs1
     linpb%gij            => gdofs_nat
     linpb%mpi_comm       = mpi_comm_world
   end select
  case default
   call error(this_prog_name,'',              &
     'Unknown solver "'//trim(linsolver)//'".')
 end select
 call linpb%factor('analysis') ! use only the skeleton
 !-----------------------------------------------------------------------

 ! For explicit advection, precompute the system matrix (with inverse)
 if(.not.implicit_advection) then

   !---------------------------------------------------------------------
   ! Assemble the matrix
   call error(this_prog_name,'', &
     'The ODE solver should include an init method which initializes'// &
     'the problem according to the chosen ode: in particular, in this'//&
     'case it is necessary to pass the parameter sigma')
   t0 = my_second()
   call ns_mat(mmmt,fff,linsys,a22ia21,a22ib2t,a22if2, ddc_grid , &
          ubase,pbase,grid,bcs,udofs,pdofs,add_div_lagmultiplier, &
 pressure_stabilization,trim(p_stab_type),velocity_stabilization, &
     b_err, sigma=huge(dt),a22i=a22i,a12a22i=a12a22i,b2a22i=b2a22i)

   if(b_err%lerr) call error(this_prog_name,'',b_err%message)
   t1 = my_second()
   write(message(1),elapsed_format) &
   'Completed assembling of the linear system: elapsed time ',t1-t0,'s.'
   call info(this_prog_name,'',message(1))
   !---------------------------------------------------------------------

   !---------------------------------------------------------------------
   ! Factorize the linear system
   t0 = my_second()

   call linpb%factor('factorization')

   t1 = my_second()
   write(message(1),elapsed_format) &
     'Completed linear system factorization: elapsed time ',t1-t0,'s.'
   call info(this_prog_name,'',message(1))
   !---------------------------------------------------------------------

 endif
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Set the initial condition
 if(initial_cnd_from_file) then
   if(mpi_nd.gt.1) then
     write(initial_cnd_file_name,'(a,a,i3.3)') &
          trim(initial_cnd_file_name),'.',mpi_id
   endif
   write(message(1),'(a,a,a)') &
     'Reading the initial condition from file "', &
     trim(initial_cnd_file_name),'".'
   call info(this_prog_name,'',message(1))
   call new_file_unit(fu,ierr)
   open(fu,file=trim(initial_cnd_file_name), &
        status='old',action='read',form='formatted',iostat=ierr)
   call read_octave_al(uuu0_input,'uuu',fu)
   close(fu,iostat=ierr)
   call untrim_sol(uuu0%uuu,uuu0_input,grid,udofs,pdofs,linsys)
   deallocate(uuu0_input)
 else
   ! set the initial velocity
   uuu0%uuu(1:udofs%d*udofs%ndofs) = reshape(        &
     coeff_init(udofs%x) , (/ udofs%d*udofs%ndofs /) )
   ! set the initial pressure (and the Lagrange multiplier) to 0
   uuu0%uuu(udofs%d*udofs%ndofs+1:) = 0.0_wp
   ! Note: this sets to zero also the velocity ghost dofs, if they are
   ! present. A better solution would require a separate call to
   ! initialize such degrees of freedom as well. Also, this is not
   ! completely consistent with the addition of bubble degrees of
   ! freedom, which are not Lagrangian dofs. To account for bubble
   ! degrees of freedom one should use a local projector.
 endif
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Setup for the fast output
 if(ne_fop.gt.0) &
   ! t_n not used in the initialization phase
   call write_selected_output( trim(base_name) , t_n , uuu0%uuu ,.true.)
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Time integration
 !
 !   0        1                 n             nstep
 !   |--------|--------|--------|--------|------|
 ! tt_sta            t_nm1     t_n            tt_end
 !                     | step_n |
 !

 ! Syncronization of the MPI processes.
 !  The following variables are set and/or sychronized: nstep dt
 !  tt_sta tt_end
 call init_time_stepping()

 call new_cg_ns_ode(uuun,size(uuu0%uuu))
 !n_start = 1; suold = 0; swork = 0 ! bdf1
 !n_start = 2; suold = 1; swork = 2 ! bdf2
 n_start = 3; suold = 2; swork = 3 ! bdf3
 allocate(uold(suold))
 do n=1,suold
   call new_cg_ns_ode(uold(n),size(uuu0%uuu))
 enddo
 allocate(work(swork))
 do n=1,swork
   call new_cg_ns_ode(work(n),size(uuu0%uuu))
 enddo
 !call bdf1_init(dt)
 !call bdf2_init(dt,tt_sta,uuu0,uold(1))
 call bdf3_init(dt,tt_sta,uuu0,work,uold)

 t_n = tt_sta + real(n_start-1,wp)*dt
 time_last_out   = real(n_start-1,wp)*dt; n_out  = 0
 time_last_out2  = real(n_start-1,wp)*dt; n_out2 = 0
 time_last_check = real(n_start-1,wp)*dt
 time_loop: do n=n_start,nstep
   !$ if(detailed_timing_omp) then
   !$   call omput_push_key("TimeStep")
   !$   call omput_start_timer()
   !$ endif

   t0 = my_second()
   t0step = t0

   call init_time_step()

   !---------------------------------------------------------------------
   ! Time-step
   !call bdf1_step(uuun,t_n,uuu0)
   !call bdf2_step(uuun,t_n,uuu0,work,uold(1))
   call bdf3_step(uuun,t_n,uuu0,work,uold)
   uuu0 = uuun
   !---------------------------------------------------------------------

   !---------------------------------------------------------------------
   ! Checkpoint
   if(write_checkpoint) then
     t1 = my_second()
     call check_step( t1-t0step )
   endif
   !---------------------------------------------------------------------

   !---------------------------------------------------------------------
   ! Output
   if(write_output) then
     t1 = my_second()
     n_out = n_out+1
     call write_step( n, nstep, t1-t0step, n_out, &
                   trim(base_name), t_n, uuu0%uuu )
   endif
   if(write_output2) then
     if(ne_fop.gt.0) then
       call write_selected_output(trim(base_name),t_n,uuu0%uuu,.false.)
     endif
   endif
   !---------------------------------------------------------------------

   !$ if(detailed_timing_omp) then ! TimeStep
   !$   call omput_write_time()
   !$   call omput_close_timer()
   !$   call omput_pop_key()
   !$ endif
 enddo time_loop
 !-----------------------------------------------------------------------

 !-----------------------------------------------------------------------
 ! Clean up
 call linpb%clean()
 call clear( uuu0 ); call clear( uuun )
 !call bdf1_clean()
 !call bdf2_clean()
 call bdf3_clean()
 deallocate( uold , work )
 deallocate( dofs_nat , dofs_dir , fff , gdofs_nat )
 deallocate( uuu1%x , uuu2 , fff1 , fff2 , rhs1 )
 deallocate( a22ia21 , a22ib2t , a22if2 ) 
 if(.not.implicit_advection) deallocate( a22i , a12a22i , b2a22i )
 if(ne_fop.gt.0) deallocate(ie_fop)

 call mod_cg_ns_linsystem_destructor()

 call clear( mmmt )

 call mod_cg_ns_setup_destructor()

 t1 = my_second()
 write(message(1),elapsed_format) &
   'Done: elapsed time ',t1-t00,'s.'
 call info(this_prog_name,'',message(1))

 !$ if(detailed_timing_omp) call mod_omp_utils_destructor()

 call mod_messages_destructor()
 !-----------------------------------------------------------------------

contains

 !-----------------------------------------------------------------------
 
 subroutine write_step( n, nstep, wc_time, n_out, &
                        bname, t_n, uuu )
  integer, intent(in) :: n, nstep, n_out
  real(t_realtime), intent(in) :: wc_time
  character(len=*), intent(in) :: bname
  real(wp), intent(in) :: t_n, uuu(:)

  integer :: ie, i
  real(wp) :: uuu_dg(grid%ne*ubase%pk,grid%d), ppp_dg(grid%ne*pbase%pk)
  real(wp), allocatable :: uuut(:) ! trimmed solution
  integer :: fu, ierr

  character(len=*), parameter :: time_stamp_format = '(i4.4)'
  character(len=4) :: time_stamp
  character(len=*), parameter :: suff1 = '-res-'
  character(len=*), parameter :: suff2 = '.octxt'
  character(len= len_trim(bname) + len(suff1) + 4 + len(suff2)) :: &
     out_file_res_name

   ! trim the solution eleiminating ghost dofs
   call trim_sol(uuut,uuu,grid,udofs,pdofs,linsys)

   ! DG-like output
   do ie=1,grid%ne
     do i=1,grid%d
       uuu_dg( (ie-1)*ubase%pk+1 : ie*ubase%pk , i ) = &
         uuut( (udofs%dofs(:,ie)-1)*grid%d + i )
     enddo
     ppp_dg( (ie-1)*pbase%pk+1 : ie*pbase%pk ) = &
       uuut( grid%d*udofs%ndofs + pdofs%dofs(:,ie) )
   enddo

   ! Terminal output
   write(message(1),'(a,i6,a,i6,a)') &
     'Time step ',n,' of ',nstep,'; elapsed time '
   write(message(1),elapsed_format) trim(message(1)),wc_time,'s.'
   write(message(2),elapsed_format) &
     '  solution of the linear system: ',t_linsys,'s.'
   write(message(3),elapsed_format) &
     '  post-processings:              ',t_postproc,'s.'
   call info(this_prog_name,'',message(1:3))

   ! File output
   call new_file_unit(fu,ierr)
   write(time_stamp,time_stamp_format) n_out
   out_file_res_name = trim(bname)//suff1//time_stamp//suff2
   open(fu,file=trim(out_file_res_name), &
        status='replace',action='write',form='formatted',iostat=ierr)
     ! preamble
     call write_octave(test_name       ,'test_name'       ,fu)
     call write_octave(test_description,'test_description',fu)
     call write_octave(t_n,'time',fu)
     ! data
     if(write_sys) then
       call write_octave(dofs_nat,'c','dofs_nat',fu)
       call write_octave(dofs_dir,'c','dofs_dir',fu)
       call write_octave(transpose(mmmt%m(1,1)),'mmm11',fu)
       call write_octave(transpose(mmmt%m(1,2)),'mmm21',fu)
       call write_octave(transpose(mmmt%m(2,1)),'mmm12',fu)
       call write_octave(transpose(mmmt%m(2,2)),'mmm22',fu)
       call write_octave(fff,'c','fff',fu)
     endif
     call write_octave(uuut,  'c','uuu',   fu )
     call write_octave(uuu_dg,    'uuu_dg',fu)
     call write_octave(ppp_dg,'c','ppp_dg',fu)
     if(compute_error_norms) then
       call write_octave(e_u_l2,'e_u_l2',fu)
       call write_octave(e_u_h1,'e_u_h1',fu)
       call write_octave(e_p_l2,'e_p_l2',fu)
     endif
   close(fu,iostat=ierr)

 end subroutine write_step

 !-----------------------------------------------------------------------
 
 !> Write the solution on a selected subset of elements
 !!
 !! The solution is written in a single file by appending each new
 !! time level. Only the preamble is written in octave format, since
 !! there is no easy way to write an array incrementally in octave.
 !!
 !! \note When \c init is set to <tt>.true.<\tt> the input arguments
 !! \c t_n and \c uuu are not used, so that they don't need to be
 !! defined. However, the variables \c grid, \c ubase and \c pbase
 !! need to be defined.
 !!
 !! \todo This function does not handle correctly ghost dofs, so when
 !! there are velocity ghost dofs the pressure values are not correct.
 subroutine write_selected_output( bname, t_n, uuu , init )
  character(len=*), intent(in) :: bname
  real(wp), intent(in) :: t_n, uuu(:)
  logical, intent(in) :: init
 
  integer :: ie, i
  real(wp), allocatable :: uu_sel(:,:,:), pp_sel(:,:)
  real(wp), allocatable :: uuut(:) ! trimmed solution
  character(len=10) :: uu_number, pp_number

  character(len=*), parameter :: delimiter = '#-/DELIMITER\-#'
  character(len=*), parameter :: suff1 = '-selectedres.octxt'
  character(len= len_trim(bname) + len(suff1)) :: out_file_res_name

   out_file_res_name = trim(bname)//suff1

   if(init) then
     call new_file_unit(fu,ierr)
     open(fu,file=trim(out_file_res_name), status='replace', &
       action='write',form='formatted')
      ! indexes of the selected elements
      call write_octave(ie_fop,'r','ie_fop',fu)
      ! shape of a velocity block
      call write_octave((/grid%d,ubase%pk,ne_fop/),'r','uu',fu)
      ! shape of a pressure block
      call write_octave((/   1  ,pbase%pk,ne_fop/),'r','pp',fu)
      write(fu,'(a)') delimiter
     close(fu)
     return
   endif

   ! trim the solution eleiminating ghost dofs
   call trim_sol(uuut,uuu,grid,udofs,pdofs,linsys)
 
   allocate( uu_sel(grid%d,ubase%pk,ne_fop) , &
             pp_sel(       pbase%pk,ne_fop) )
   do ie=1,ne_fop
     do i=1,grid%d
       uu_sel(i,:,ie) = uuut( (udofs%dofs(:,ie_fop(ie))-1)*grid%d + i )
     enddo
     pp_sel(:,ie) = uuut(grid%d*udofs%ndofs + pdofs%dofs(:,ie_fop(ie)))
   enddo

   write(uu_number,'(i10)') grid%d*ubase%pk*ne_fop
   write(pp_number,'(i10)')        pbase%pk*ne_fop
   call new_file_unit(fu,ierr)
   open(fu,file=trim(out_file_res_name), status='old',    &
     action='readwrite',form='formatted',position='append')
    ! current time
    write(fu,'('//real_format//')') t_n
    ! velocity
    write(fu,'('//uu_number//'('//real_format//')'//')') uu_sel
    ! pressure
    write(fu,'('//pp_number//'('//real_format//')'//')') pp_sel
   close(fu)

   deallocate( uu_sel , pp_sel , uuut )

 end subroutine write_selected_output
 
!-----------------------------------------------------------------------

 subroutine check_step(wc_timestep)
  real(t_realtime), intent(in) :: wc_timestep

   ! Terminal output
   write(message(1),'(a,i6,a,i6,a,e8.2,a,e8.2,a)') &
     'Time step ',n,' of ',nstep,'; model time ',t_n,' of ',tt_end,'.'
   write(message(2),'(a)') &
     '  CPU timings:'
   write(message(3),elapsed_format) &
     '    total elapsed time:            ',my_second()-t00,'s.'
   write(message(4),elapsed_format) &
     '    last time step:                ',wc_timestep,'s.'
   write(message(5),elapsed_format) &
     '    solution of the linear system: ',t_linsys,'s.'
   write(message(6),elapsed_format) &
     '    post-processings:              ',t_postproc,'s.'
   call info(this_prog_name,'',message(1:6))

 end subroutine check_step

 !-----------------------------------------------------------------------

 subroutine init_time_stepping()
 ! Set the main time stepping variables in a consistent way on all the
 ! processes.
 
  integer, parameter :: &
    n_ibuff = 1, &
    n_rbuff = 3
  integer ::  intgbuff(n_ibuff)
  real(wp) :: realbuff(n_rbuff)
  integer :: ierr

   if(mpi_id.eq.0) then
     nstep = ceiling((tt_end-tt_sta)/dt)
   endif

   if(mpi_nd.le.1) return

   if(mpi_id.eq.0) then ! master process

     intgbuff = (/ nstep /)
     realbuff = (/ dt , tt_sta , tt_end /)
     call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)

   else

     call mpi_bcast(intgbuff,n_ibuff,mpi_integer,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)
     nstep  = intgbuff(1)
     dt     = realbuff(1)
     tt_sta = realbuff(2)
     tt_end = realbuff(3)

   endif
   
 end subroutine init_time_stepping
 
!-----------------------------------------------------------------------

 subroutine init_time_step()
 ! Set the main time step variables in a consistent way on all the
 ! processes.
 
  integer, parameter :: &
    n_lbuff = 3, &
    n_rbuff = 2
  logical ::  logcbuff(n_lbuff)
  real(wp) :: realbuff(n_rbuff)
  integer :: ierr

   if(mpi_id.eq.0) then
     t_nm1 = t_n                  ! t^{n-1}
     t_n = tt_sta + real(n,wp)*dt ! t^n
     if(n.eq.nstep) then ! fix the last time step
       t_n = tt_end
       dt = t_n-t_nm1; call init_time_stepping() ! synchronize dt
       call warning(this_prog_name,'', &
                   'The time stepping method should include a way ' // &
                   'to reset the time-step (possibly loosing accuracy)')
       time_last_out  = huge(time_last_out )/100.0_wp
       time_last_out2 = huge(time_last_out2)/100.0_wp
     endif
     time_last_check = time_last_check + dt
     time_last_out   = time_last_out   + dt
     time_last_out2  = time_last_out2  + dt
     write_checkpoint = time_last_check .ge. out_safety_factor*dt_check
     write_output     = time_last_out   .ge. out_safety_factor*dt_out
     write_output2    = time_last_out2  .ge. out_safety_factor*dt_out2
     if(write_checkpoint) time_last_check = time_last_check - dt_check
     if(write_output)     time_last_out = time_last_out - dt_out
     if(write_output2)    time_last_out2 = time_last_out2 - dt_out2
   endif

   if(mpi_nd.le.1) return

   if(mpi_id.eq.0) then ! master process

     logcbuff = (/ write_checkpoint , write_output , write_output2 /)
     realbuff = (/ t_n , t_nm1 /)
     call mpi_bcast(logcbuff,n_lbuff,mpi_logical,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)

   else

     call mpi_bcast(logcbuff,n_lbuff,mpi_logical,0,mpi_comm_world,ierr)
     call mpi_bcast(realbuff,n_rbuff,wp_mpi     ,0,mpi_comm_world,ierr)
     write_checkpoint = logcbuff(1)
     write_output     = logcbuff(2)
     write_output2    = logcbuff(3)
     t_n              = realbuff(1)
     t_nm1            = realbuff(2)

   endif
   
 end subroutine init_time_step
 
!-----------------------------------------------------------------------

end program cg_ns_main
